Lab 03¶

Student Name: Aagnay Kariyal¶

Student ID: 8830232¶

. . .

Data Preprocessing Tasks¶


1) Detecting and Handling Outliers¶

In [ ]:
# Importing all the dependencies
import sklearn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
In [ ]:
# Inserting the data from the csv into a dataframe
data = pd.read_csv('/Users/aagnaykariyal/Documents/Conestoga/AI Algorithms/Lab 03/creditcard.csv')
data
Out[ ]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
284802 172786.0 -11.881118 10.071785 -9.834783 -2.066656 -5.364473 -2.606837 -4.918215 7.305334 1.914428 ... 0.213454 0.111864 1.014480 -0.509348 1.436807 0.250034 0.943651 0.823731 0.77 0
284803 172787.0 -0.732789 -0.055080 2.035030 -0.738589 0.868229 1.058415 0.024330 0.294869 0.584800 ... 0.214205 0.924384 0.012463 -1.016226 -0.606624 -0.395255 0.068472 -0.053527 24.79 0
284804 172788.0 1.919565 -0.301254 -3.249640 -0.557828 2.630515 3.031260 -0.296827 0.708417 0.432454 ... 0.232045 0.578229 -0.037501 0.640134 0.265745 -0.087371 0.004455 -0.026561 67.88 0
284805 172788.0 -0.240440 0.530483 0.702510 0.689799 -0.377961 0.623708 -0.686180 0.679145 0.392087 ... 0.265245 0.800049 -0.163298 0.123205 -0.569159 0.546668 0.108821 0.104533 10.00 0
284806 172792.0 -0.533413 -0.189733 0.703337 -0.506271 -0.012546 -0.649617 1.577006 -0.414650 0.486180 ... 0.261057 0.643078 0.376777 0.008797 -0.473649 -0.818267 -0.002415 0.013649 217.00 0

284807 rows × 31 columns

Using the whisker approach

In [ ]:
# Creating a function to find if there are any outliers using the box whisker approach.
def clean(var):
    Q1 = data[var].quantile(0.25)
    Q3 = data[var].quantile(0.75)
    IQR = Q3 - Q1
    
    upper = Q3 + (1.5 * IQR)
    lower = Q1 - (1.5 * IQR)
    
    upper_values = np.array(data[var] >= upper)
    lower_values = np.array(data[var] <= lower)
    upper_count = np.prod(upper_values.shape)
    lower_count = np.prod(lower_values.shape)
    result = None
    
    if upper_count & lower_count == 0:
        return f"There are no outliers in the column {var}"
    else:
        upper_values = np.where(data[var] >= upper)[0]
        lower_values = np.where(data[var] <= lower)[0]
        
        data[var] = data[var].drop(index=upper_values, inplace=None)
        data[var] = data[var].drop(index=lower_values, inplace=None)
        return "The column is now cleaned"
In [ ]:
col_names = data.columns # Getting all the column names in the dataframe
col_names = col_names.drop('Class') # Dropping class as it only has two values to detect fraudulent data
for names in col_names: # Iterating through all the columns except class to clean the data
    print(names)
    print(clean(names))
Time
The column is now cleaned
V1
The column is now cleaned
V2
The column is now cleaned
V3
The column is now cleaned
V4
The column is now cleaned
V5
The column is now cleaned
V6
The column is now cleaned
V7
The column is now cleaned
V8
The column is now cleaned
V9
The column is now cleaned
V10
The column is now cleaned
V11
The column is now cleaned
V12
The column is now cleaned
V13
The column is now cleaned
V14
The column is now cleaned
V15
The column is now cleaned
V16
The column is now cleaned
V17
The column is now cleaned
V18
The column is now cleaned
V19
The column is now cleaned
V20
The column is now cleaned
V21
The column is now cleaned
V22
The column is now cleaned
V23
The column is now cleaned
V24
The column is now cleaned
V25
The column is now cleaned
V26
The column is now cleaned
V27
The column is now cleaned
V28
The column is now cleaned
Amount
The column is now cleaned
In [ ]:
print(data.shape)
data = data.dropna() # Removing all the null values from cleaning the data
print(data.shape)
(284807, 31)
(146334, 31)

Descriptive Analytical Tasks¶


  1. Analyzing the distribution of Fraud and Non-Fraud transactions
In [ ]:
data['Class'].unique() # Means it is a Fraud, 0 means it is not
# Converting the numerical values into categorical for easier analysis
counts = data['Class'].value_counts()
print(counts)
Class
0    146319
1        15
Name: count, dtype: int64

From the above data we can see that there are only 15 Fraudulent cases wheras there are 1,46,319 Non-Fraudulent cases

  1. Analyzing the correlation between the Class feature and all the other features
In [ ]:
c_matrix = data.corr()
c_matrix.head(30)
Out[ ]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
Time 1.000000 0.243302 -0.079032 -0.584282 -0.206164 0.274393 -0.170560 0.111473 -0.186010 0.146934 ... 0.072079 0.112380 0.239050 -0.000278 -0.326817 -0.055266 -0.067510 -0.202418 -0.028983 -0.004837
V1 0.243302 1.000000 -0.607645 -0.653576 0.139793 -0.302659 -0.068908 -0.433885 -0.452119 0.288310 ... -0.085250 -0.006150 0.359266 -0.011529 0.150539 -0.006017 -0.266911 -0.388033 0.007758 -0.002218
V2 -0.079032 -0.607645 1.000000 0.182312 0.280412 0.584172 -0.142545 0.769223 0.130582 -0.073881 ... -0.042753 -0.115434 -0.265716 0.041263 -0.091146 0.016220 0.205829 0.298994 -0.391074 0.007927
V3 -0.584282 -0.653576 0.182312 1.000000 0.039967 -0.269626 0.330173 -0.086539 0.417561 -0.122859 ... -0.007480 -0.027404 -0.251142 0.099846 0.002230 -0.028376 0.260271 0.342201 -0.002391 0.001260
V4 -0.206164 0.139793 0.280412 0.039967 1.000000 0.112006 0.184414 0.101914 0.048369 0.283920 ... -0.011274 -0.038167 0.020770 0.081798 0.190120 -0.101336 -0.044333 0.012608 -0.044219 0.009452
V5 0.274393 -0.302659 0.584172 -0.269626 0.112006 1.000000 -0.128285 0.782008 -0.196368 -0.043302 ... 0.002381 -0.057916 -0.234678 -0.211187 -0.031812 0.017984 0.013467 0.053261 -0.264651 0.006523
V6 -0.170560 -0.068908 -0.142545 0.330173 0.184414 -0.128285 1.000000 -0.388301 0.586180 0.117858 ... 0.037699 0.083301 -0.100281 -0.400347 0.002433 -0.065754 0.130008 0.022511 0.157488 0.000990
V7 0.111473 -0.433885 0.769223 -0.086539 0.101914 0.782008 -0.388301 1.000000 -0.334971 -0.120979 ... -0.044497 -0.103827 -0.265414 0.041796 -0.026315 0.031804 0.040198 0.149340 -0.117201 0.005647
V8 -0.186010 -0.452119 0.130582 0.417561 0.048369 -0.196368 0.586180 -0.334971 1.000000 0.024204 ... 0.079639 0.053019 -0.020607 -0.152549 -0.151529 -0.020628 0.231209 0.194429 0.033047 -0.001003
V9 0.146934 0.288310 -0.073881 -0.122859 0.283920 -0.043302 0.117858 -0.120979 0.024204 1.000000 ... 0.017362 0.054594 0.168975 0.022936 -0.045431 0.058850 -0.041965 -0.164740 -0.050677 -0.004078
V10 -0.007882 0.445023 -0.553563 -0.173788 -0.087289 -0.423449 0.083803 -0.477550 -0.217438 -0.470600 ... -0.052116 0.048983 0.174983 -0.047679 0.037267 -0.078132 -0.077551 -0.217124 0.114461 -0.003546
V11 -0.236306 0.019722 -0.010797 0.073995 -0.006415 -0.030536 0.099727 -0.055596 0.121963 -0.141010 ... 0.048885 -0.000403 0.069159 0.119546 0.013070 0.020900 -0.040857 -0.036118 -0.025044 0.004570
V12 0.032206 0.069376 0.148319 -0.042362 0.268600 0.119744 0.137279 0.088651 0.091836 0.242455 ... -0.019428 -0.021909 0.096152 0.058350 0.021282 0.004256 -0.003691 -0.034362 -0.047634 -0.003404
V13 0.013180 0.057120 0.019813 -0.009552 -0.072258 0.024709 0.000934 -0.009498 -0.211640 -0.146897 ... -0.048742 0.028746 0.002986 -0.031777 0.004273 0.005317 0.008804 0.013166 -0.012856 -0.003517
V14 -0.098426 -0.006022 0.120643 -0.089311 0.120402 0.091022 -0.121111 0.178257 0.035194 -0.022434 ... 0.084032 -0.000729 -0.001767 -0.042380 0.077851 0.005036 -0.110840 -0.057218 -0.023679 -0.012567
V15 -0.266930 0.053819 0.062259 0.111949 0.077587 -0.142691 -0.171105 -0.046796 -0.113585 0.021910 ... 0.051451 -0.011465 0.045359 0.002930 0.043396 -0.001009 0.002098 0.052868 -0.052826 0.000998
V16 0.005020 0.011258 0.026171 0.009006 -0.112167 -0.007170 -0.011445 -0.113516 0.103067 -0.012713 ... 0.213687 0.038081 0.001661 -0.102424 -0.051959 0.008470 -0.036289 0.006640 -0.074567 0.006280
V17 -0.102785 -0.020967 -0.056932 -0.031761 -0.008901 -0.165365 -0.061519 -0.130620 0.093612 -0.144079 ... -0.037357 0.014270 0.049259 0.213738 0.040237 -0.016323 0.051915 0.089018 0.049569 0.008973
V18 0.123139 -0.044440 -0.009454 -0.025746 -0.015652 0.003321 0.118668 -0.092701 0.148402 0.012010 ... -0.009102 0.051939 -0.122062 -0.114353 -0.066279 0.015687 0.023948 0.014886 0.068833 0.004756
V19 0.092284 0.037674 -0.023533 -0.131766 -0.119560 0.039804 0.036611 0.000852 -0.018928 0.044184 ... 0.009242 -0.020466 -0.078565 -0.173735 -0.013883 -0.030291 -0.020989 -0.057714 -0.024579 -0.008324
V20 -0.193463 -0.377559 0.220747 0.303830 0.005787 0.202354 0.040514 0.248920 0.039452 -0.020033 ... 0.202097 0.080750 -0.261388 -0.007695 0.051239 0.035376 0.235675 0.161490 0.251908 0.000007
V21 0.072079 -0.085250 -0.042753 -0.007480 -0.011274 0.002381 0.037699 -0.044497 0.079639 0.017362 ... 1.000000 0.920087 -0.408208 0.074965 0.138300 -0.065896 -0.009959 0.037102 0.152230 0.001031
V22 0.112380 -0.006150 -0.115434 -0.027404 -0.038167 -0.057916 0.083301 -0.103827 0.053019 0.054594 ... 0.920087 1.000000 -0.408591 0.073290 0.122673 -0.064878 0.031591 -0.026029 0.086033 0.000456
V23 0.239050 0.359266 -0.265716 -0.251142 0.020770 -0.234678 -0.100281 -0.265414 -0.020607 0.168975 ... -0.408208 -0.408591 1.000000 0.203740 -0.581289 -0.025379 -0.055669 -0.102295 -0.017705 -0.006103
V24 -0.000278 -0.011529 0.041263 0.099846 0.081798 -0.211187 -0.400347 0.041796 -0.152549 0.022936 ... 0.074965 0.073290 0.203740 1.000000 -0.064681 0.016947 -0.046394 0.063173 -0.015319 0.000291
V25 -0.326817 0.150539 -0.091146 0.002230 0.190120 -0.031812 0.002433 -0.026315 -0.151529 -0.045431 ... 0.138300 0.122673 -0.581289 -0.064681 1.000000 -0.060312 -0.173125 -0.146007 0.035212 0.003670
V26 -0.055266 -0.006017 0.016220 -0.028376 -0.101336 0.017984 -0.065754 0.031804 -0.020628 0.058850 ... -0.065896 -0.064878 -0.025379 0.016947 -0.060312 1.000000 -0.205459 -0.043104 -0.037231 -0.003176
V27 -0.067510 -0.266911 0.205829 0.260271 -0.044333 0.013467 0.130008 0.040198 0.231209 -0.041965 ... -0.009959 0.031591 -0.055669 -0.046394 -0.173125 -0.205459 1.000000 0.573672 -0.121647 -0.002895
V28 -0.202418 -0.388033 0.298994 0.342201 0.012608 0.053261 0.022511 0.149340 0.194429 -0.164740 ... 0.037102 -0.026029 -0.102295 0.063173 -0.146007 -0.043104 0.573672 1.000000 -0.016839 -0.000618
Amount -0.028983 0.007758 -0.391074 -0.002391 -0.044219 -0.264651 0.157488 -0.117201 0.033047 -0.050677 ... 0.152230 0.086033 -0.017705 -0.015319 0.035212 -0.037231 -0.121647 -0.016839 1.000000 -0.002288

30 rows × 31 columns

In [ ]:
highest_values = c_matrix.iloc[-1].sort_values(ascending=False)
features = highest_values.index[1:11]
print(features)
Index(['V4', 'V17', 'V2', 'V5', 'V16', 'V7', 'V18', 'V11', 'V25', 'V3'], dtype='object')

From the above code we can see that V4 has the highest correlation with the Class target variable

In [ ]:
corr_df = pd.DataFrame()

for cols in features:
    temp_df = data[cols]
    corr_df = pd.concat([corr_df, temp_df], axis=1)
    
temp_df = data['Class']
corr_df = pd.concat([corr_df, temp_df], axis=1)
sns.pairplot(corr_df)
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x1606986d0>

The above code is used to choose the 10 features with the highest correlation with the tarrget variable, and plotting their distributions.

ML Model Training and Testing¶


  1. Splitting the cleaned dataset into training and test data
In [ ]:
# Splitting the data
X = data.drop('Class', axis=1)
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [ ]:
model = LogisticRegression()
  1. Doing a 5-fold cross validation
In [ ]:
scores = cross_val_score(model, X_train, y_train, cv=5)
/Users/aagnaykariyal/Documents/Github/venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  1. Training the Logistic regression model
In [ ]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
  1. Printing the coefficients of the model
In [ ]:
print(model.coef_)
print('\n\n')
highest_coef = model.coef_.max()
ind = model.coef_[0]
ind = np.where(ind == highest_coef)[0]
print(f'Index {ind}: {col_names[ind][0]}')
[[-6.57658256e-05 -1.16703101e+00 -6.43524652e-01 -2.20152230e+00
   3.91423094e-02  4.23592631e-01  7.93887129e-01  2.71228748e-02
  -1.01643068e-01 -4.37429038e-01  3.50121991e-01 -1.97938753e-01
  -3.05169886e-01 -1.21550905e-01 -5.44924537e-01 -3.91367603e-01
   1.85033458e-01  5.62454114e-01  3.62881263e-01 -4.15527836e-02
   2.27389182e-01  2.39906827e-01  1.95715356e-01  2.42277553e-01
   3.76946804e-01 -6.04572924e-01  1.78265176e-01 -1.96695438e-01
  -4.26767168e-02 -2.92516425e-02]]



Index [6]: V6

From the above output we can see that V6 feature has the highest weight

  1. Evaluating the model's performance on the test set using accuracy, precision, recall and f1 scores
In [ ]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'{accuracy}')
print(f'{precision}')
print(f'{recall}')
print(f'{f1}')
0.9998633272969556
0.0
0.0
0.0
  1. Plotting the ROC curve and printing the AUC
In [ ]:
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC Curve)')
plt.legend(loc="lower right")
plt.show()

roc_auc = auc(fpr, tpr)
print(f'The area under the curve is {roc_auc}')
The area under the curve is 0.4999829141607436
  1. Applying compute_class_weight techniques to handle class imbalance
In [ ]:
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_test), y = y_test)
class_weights = dict(enumerate(class_weights))
class_weights
Out[ ]:
{0: 0.5000512575177692, 1: 4877.833333333333}
  1. Performing KNN and SVM Classficiation to compare it to the Logistic Regression Baseline
In [ ]:
# KNN Classification
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Printing the classification report and confusion matrix
print("KNN Classification Report:")
print(classification_report(y_test, y_pred_knn))
print("KNN Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))
KNN Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     29264
           1       0.00      0.00      0.00         3

    accuracy                           1.00     29267
   macro avg       0.50      0.50      0.50     29267
weighted avg       1.00      1.00      1.00     29267

KNN Confusion Matrix:
[[29264     0]
 [    3     0]]
/Users/aagnaykariyal/Documents/Github/venv/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/aagnaykariyal/Documents/Github/venv/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/aagnaykariyal/Documents/Github/venv/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
svm_acc = accuracy_score(y_test, svm_pred)
print(f'Accuracy score of svm is : {svm_acc}')
Accuracy score of svm is : 0.9998974954727167

As we can see, the Accuracy score of SVM, KNN and the Logistic Regression Baseline is extremely similar

In [ ]: